#Although I was able to obtain the data from AWS, the Aslett instance would not allow me to install packages therefore I moved the code to my desktop instance
#I was unable to knit the file while pointing to AWS therefore I saved CSV files that I then called off my local directory
#install.packages("RCurl")
#library("RCurl")
#library(jsonlite)
#cs2 <- read.table(textConnection(getURL(
# "https://smuddsproject2.s3-us-east-2.amazonaws.com/CaseStudy2-data.csv"
#)), sep=",", header=TRUE)
#install.packages("RCurl")
#library("RCurl")
#library(jsonlite)
#cs2 <- read.table(textConnection(getURL(
# "https://smuddsproject2.s3-us-east-2.amazonaws.com/CaseStudy2-data.csv"
#)), sep=",", header=TRUE)
#library(tidyverse)
#library(aws.s3)
#library(readxl)
#Sys.setenv("AWS_ACCESS_KEY_ID" = "AKIARVCEVL4C3YMB7QUW",
# "AWS_SECRET_ACCESS_KEY" = "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",
# "AWS_DEFAULT_REGION" = "us-east-2")
# Using aws.s3
#aws.s3::bucketlist()
#aws.s3::get_bucket("smuddsproject2")
#Read in Data
#cs2cmpst = s3read_using(FUN = read.csv,
# bucket = "smuddsproject2",
# object = "CaseStudy2CompSet No Attrition.csv")
#cs2cmpns = s3read_using(FUN = read_excel,
# bucket = "smuddsproject2",
# object = "CaseStudy2CompSet No Salary.xlsx")
#Save the data obtained from AWS to my desktop/cloud to process #By saving these I was then unable to call the files and then knit the RMD file for completion #My workaround in the eleventh hour!
#Write to CSV files
#write.csv(cs2, "cs2.csv", row.names = F)
#write.csv(cs2cmpst, "cs2cmpst.csv", row.names = F)
#write.csv(cs2cmpns, "cs2cmpns.csv", row.names = F)
#Reading the data off my local directory
cs2 = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")
cs2cmpns = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")
cs2cmpst = read.csv("/Users/amyadyanthaya/Desktop/DS6306/cs2/cs2.csv", encoding = "UTF-8")
This section is to look at the individual variables to determine if they impact Attrition Those variables that showed no impact the plots were removed from this section
#Look for missing observations
anyNA(cs2)
## [1] FALSE
#CONTINIOUS VARIABLES
# Age
cs2 %>%
ggplot(aes(x=Age, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Age by Attrition") +
xlab("Age (Years)") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
group_by(Attrition) %>%
summarize(medianAge = median(Age)) %>%
ggplot(aes(x=Attrition, y=medianAge, fill = Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Age by Attrition") +
xlab("Attrition")+ ylab("Median Age (Years)") +
theme(legend.position = "none")
# DistanceFromHome
cs2 %>%
ggplot(aes(x=DistanceFromHome, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Distance from Home by Attrition") +
xlab("Distance From Home") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
group_by(Attrition) %>%
summarize(medianDistanceFromHome = median(DistanceFromHome),count=n()) %>%
ggplot(aes(x=Attrition, y=medianDistanceFromHome, fill = Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Distance From Home by Attrition") +
xlab("Attrition")+ ylab("Median Distance from Home") +
theme(legend.position = "none")
# NumCompaniesWorked
cs2 %>%
ggplot(aes(x=NumCompaniesWorked, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Number of Companies Worked by Attrition") +
xlab("Number of Companies Worked") + ylab("") + xlim(0,8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 33 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 4 rows containing missing values (`geom_bar()`).
cs2 %>%
group_by(Attrition) %>%
summarize(medianNumCompaniesWorked = median(NumCompaniesWorked),count=n()) %>%
ggplot(aes(x=Attrition, y=medianNumCompaniesWorked, fill=Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Number of Companies Worked by Attrition") +
xlab("Attrition")+ ylab("Median Number of Companies Worked") +
theme(legend.position = "none")
# YearsAtCompany
cs2 %>%
ggplot(aes(x=YearsAtCompany, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Years at Company by Attrition") +
xlab("Years at Company") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
group_by(Attrition) %>%
summarize(medianYearsAtCompany = median(YearsAtCompany),count=n()) %>%
ggplot(aes(x=Attrition, y=medianYearsAtCompany, fill=Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Years at Company by Attrition") +
xlab("Attrition")+ ylab("Median Years at Company") +
theme(legend.position = "none")
# YearsInCurrentRole
cs2 %>%
ggplot(aes(x=YearsInCurrentRole, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Years in Current Role by Attrition") +
xlab("Years in Current Role") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
group_by(Attrition) %>%
summarize(medianYearsInCurrentRole = median(YearsInCurrentRole),count=n()) %>%
ggplot(aes(x=Attrition, y=medianYearsInCurrentRole, fill=Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Years in Current Role by Attrition") +
xlab("Attrition")+ ylab("Median Years in Current Role") +
theme(legend.position = "none")
# YearsWithCurrManager
cs2 %>%
ggplot(aes(x=YearsWithCurrManager, color=Attrition, fill=Attrition)) +
geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Years with Current Manager by Attrition") +
xlab("Years with Current Manager") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
group_by(Attrition) %>%
summarize(medianYearsWithCurrManager = median(YearsWithCurrManager),count=n()) %>%
ggplot(aes(x=Attrition, y=medianYearsWithCurrManager, fill=Attrition)) +
geom_col() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Median Years with Current Manager by Attrition") +
xlab("Attrition")+ ylab("Median Years with Current Manager") +
theme(legend.position = "none")
#FACTORS
# Education
cs2$EducationF = as.factor(cs2$Education)
#summary(cs2$EducationF)
cs2 %>%
ggplot(aes(x=EducationF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Education by Attrition") +
xlab("Education") + ylab("")
# EnvironmentSatisifaction
cs2$EnvironmentSatisfactionF = as.factor(cs2$EnvironmentSatisfaction)
#summary(cs2$EnvironmentSatisfactionF)
cs2 %>%
ggplot(aes(x=EnvironmentSatisfactionF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Environment Satisfaction by Attrition") +
xlab("Environment Satisfaction") + ylab("")
# JobInvolvement
cs2$JobInvolvementF = as.factor(cs2$JobInvolvement)
#summary(cs2$JobInvolvementF)
cs2 %>%
ggplot(aes(x=JobInvolvementF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Job Involvement by Attrition") +
xlab("Job Involvement") + ylab("")
# JobLevel
cs2$JobLevelF = as.factor(cs2$JobLevel)
cs2 %>%
ggplot(aes(x=JobLevelF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Job Level by Attrition") +
xlab("Job Level") + ylab("")
# JobSatification
cs2$JobSatisfactionF = as.factor(cs2$JobSatisfaction)
cs2 %>%
ggplot(aes(x=JobSatisfactionF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Job Satisfaction by Attrition") +
xlab("Job Satisfaction") + ylab("")
# PerformanceRating
cs2$PerformanceRatingF = as.factor(cs2$PerformanceRating)
cs2 %>%
ggplot(aes(x=PerformanceRatingF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Performance Rating by Attrition") +
xlab("Performance Rating") + ylab("")
# RelationshipSatification
cs2$RelationshipSatisfactionF = as.factor(cs2$RelationshipSatisfaction)
#summary(cs2$RelationshipSatisfactionF)
cs2 %>%
ggplot(aes(x=RelationshipSatisfactionF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Relationship Satisfaction by Attrition") +
xlab("Relationship Satisfaction") + ylab("")
# StockOptionLevel
cs2$StockOptionLevelF = as.factor(cs2$StockOptionLevel)
cs2 %>%
ggplot(aes(x=StockOptionLevelF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Stock Option Level by Attrition") +
xlab("Stock Option Level") + ylab("")
# WorkLifeBalance
cs2$WorkLifeBalanceF = as.factor(cs2$WorkLifeBalance)
cs2 %>%
ggplot(aes(x=WorkLifeBalanceF, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Work Life Balance by Attrition") +
xlab("Work Life Balance") + ylab("")
#CATEGORICAL
# BusinessTravel
cs2 %>%
proc_freq("BusinessTravel", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
BusinessTravel | label | No | Yes | Total |
Non-Travel | Frequency | 83 | 11 | 94 |
Percent | 9.54% | 1.26% | 10.8% | |
Travel_Frequently | Frequency | 123 | 35 | 158 |
Percent | 14.14% | 4.02% | 18.16% | |
Travel_Rarely | Frequency | 524 | 94 | 618 |
Percent | 60.23% | 10.8% | 71.03% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=BusinessTravel, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Business Travel by Attrition") +
xlab("Business Travel") + ylab("")
# Department
cs2 %>%
proc_freq("Department", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
Department | label | No | Yes | Total |
Human Resources | Frequency | 29 | 6 | 35 |
Percent | 3.33% | 0.69% | 4.02% | |
Research & Development | Frequency | 487 | 75 | 562 |
Percent | 55.98% | 8.62% | 64.6% | |
Sales | Frequency | 214 | 59 | 273 |
Percent | 24.6% | 6.78% | 31.38% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=Department, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution Department by Attrition") +
xlab("Department") + ylab("")
# EducationField
cs2 %>%
proc_freq("EducationField", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
EducationField | label | No | Yes | Total |
Human Resources | Frequency | 11 | 4 | 15 |
Percent | 1.26% | 0.46% | 1.72% | |
Life Sciences | Frequency | 305 | 53 | 358 |
Percent | 35.06% | 6.09% | 41.15% | |
Marketing | Frequency | 80 | 20 | 100 |
Percent | 9.2% | 2.3% | 11.49% | |
Medical | Frequency | 233 | 37 | 270 |
Percent | 26.78% | 4.25% | 31.03% | |
Other | Frequency | 43 | 9 | 52 |
Percent | 4.94% | 1.03% | 5.98% | |
Technical Degree | Frequency | 58 | 17 | 75 |
Percent | 6.67% | 1.95% | 8.62% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=EducationField, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution Education Field by Attrition") +
xlab("Education Field") + ylab("")
# Gender
cs2 %>%
proc_freq("Gender", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
Gender | label | No | Yes | Total |
Female | Frequency | 301 | 53 | 354 |
Percent | 34.6% | 6.09% | 40.69% | |
Male | Frequency | 429 | 87 | 516 |
Percent | 49.31% | 10% | 59.31% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=Gender, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Gender by Attrition") +
xlab("Gender") + ylab("")
# MaritalStatus
cs2 %>%
proc_freq("MaritalStatus", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
MaritalStatus | label | No | Yes | Total |
Divorced | Frequency | 179 | 12 | 191 |
Percent | 20.57% | 1.38% | 21.95% | |
Married | Frequency | 352 | 58 | 410 |
Percent | 40.46% | 6.67% | 47.13% | |
Single | Frequency | 199 | 70 | 269 |
Percent | 22.87% | 8.05% | 30.92% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=MaritalStatus, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Marital Status by Attrition") +
xlab("Marital Status") + ylab("")
# Overtime
cs2 %>%
proc_freq("OverTime", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE)
Attrition | ||||
|---|---|---|---|---|
OverTime | label | No | Yes | Total |
No | Frequency | 558 | 60 | 618 |
Percent | 64.14% | 6.9% | 71.03% | |
Yes | Frequency | 172 | 80 | 252 |
Percent | 19.77% | 9.2% | 28.97% | |
Total | Frequency | 730 | 140 | 870 |
Percent | 83.91% | 16.09% | ||
cs2 %>%
ggplot(aes(x=OverTime, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Overtime by Attrition") +
xlab("Marital Status") + ylab("")
After looking at the relationship between Attrition and individual variables it was determined that TotalWorkingYears, MonthlyIncome and JobRole were the top three factors that contribute to Attrition
cs2 %>%
select(TotalWorkingYears, MonthlyIncome, JobRole) %>%
ggpairs(mapping = aes(color = cs2$Attrition)) +
scale_fill_viridis(discrete = TRUE) +
scale_color_viridis(discrete = TRUE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Impact of TotalWorkingYears on Attrition
cs2 %>%
group_by(Attrition) %>%
summarize(meanTWH = mean(TotalWorkingYears),
sdTWH = sd(TotalWorkingYears),
medianTWH = median(TotalWorkingYears),
rangeTWH = max(TotalWorkingYears)-min(TotalWorkingYears),
IQRTWH = IQR(TotalWorkingYears),
countTWH = n())
## # A tibble: 2 × 7
## Attrition meanTWH sdTWH medianTWH rangeTWH IQRTWH countTWH
## <chr> <dbl> <dbl> <dbl> <int> <dbl> <int>
## 1 No 11.6 7.46 10 37 9 730
## 2 Yes 8.19 7.16 6.5 40 7 140
cs2 %>%
ggplot(aes(x=TotalWorkingYears, fill=Attrition)) +
geom_histogram() + scale_fill_viridis(discrete = TRUE) +
ggtitle("Impact of Total Working Years on Attrition") +
xlab("Total Working Years") +
ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
ggplot(aes(y=TotalWorkingYears, color=Attrition)) +
geom_boxplot() +
scale_color_viridis(discrete = TRUE) +
ggtitle("Comparison of Total Working Years and Attrition") +
ylab("Total Working Years")
#Create factor cuts
cs2$TWYcut = cut(cs2$TotalWorkingYears, breaks = c(0,5,10,15,20,25,30,35,100), labels = c("< 5","5-10","10-15","15-20","20-25","25-30","30-35",">35"), right=FALSE)
freqTWY <- cs2 %>%
proc_freq("TWYcut", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE,
include.table_percent = FALSE,
include.column_total = FALSE,
include.row_total = FALSE,
)
freqTWY$body$dataset$PctAtt <- (freqTWY$body$dataset$Yes/(freqTWY$body$dataset$Yes + freqTWY$body$dataset$No))*100
freqTWY$body$dataset
## TWYcut label No Yes PctAtt
## 1 < 5 Frequency 91 46 33.576642
## 2 5-10 Frequency 241 46 16.027875
## 3 10-15 Frequency 196 26 11.711712
## 4 15-20 Frequency 85 11 11.458333
## 5 20-25 Frequency 65 6 8.450704
## 6 25-30 Frequency 25 2 7.407407
## 7 30-35 Frequency 20 2 9.090909
## 8 >35 Frequency 7 1 12.500000
cs2 %>%
ggplot(aes(x=TWYcut, color=Attrition, fill=Attrition)) +
geom_bar() +
scale_color_viridis(discrete = TRUE) +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Impact of Attrition per Total Working Years") +
xlab("Total Working Years") + ylab("")
#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for Attrition
# H_A: At least one population distribution differs for attrition
kruskal.test(TotalWorkingYears~Attrition, data=cs2)
##
## Kruskal-Wallis rank sum test
##
## data: TotalWorkingYears by Attrition
## Kruskal-Wallis chi-squared = 34.605, df = 1, p-value = 4.038e-09
Reject Ho, there is overwhelming evidence that suggests Total Working Years has an impact on the outcome of Attrition. 33.6% of attritions occur less then five total working years with nearly 50% of attritions occuring within 10 years of total working.
#Impact of MonthlyIncome on Attrition
cs2 %>%
group_by(Attrition) %>%
summarize(meanMI = mean(MonthlyIncome),
medianMI = median(MonthlyIncome),
rangeMI = max(MonthlyIncome)-min(MonthlyIncome),
IQRMI = IQR(MonthlyIncome),
IQRMI = IQR(MonthlyIncome),
countMI = n())
## # A tibble: 2 × 6
## Attrition meanMI medianMI rangeMI IQRMI countMI
## <chr> <dbl> <dbl> <int> <dbl> <int>
## 1 No 6702 5208. 18870 5574. 730
## 2 Yes 4765. 3171 18778 3497. 140
cs2 %>%
ggplot(aes(x=MonthlyIncome, fill=Attrition)) + geom_histogram() +
scale_fill_viridis(discrete = TRUE) +
scale_color_viridis(discrete = TRUE) +
ggtitle("Impact of Monthly Income on Attrition") +
xlab("Monthly Income") + ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cs2 %>%
ggplot(aes(x=MonthlyIncome, color=Attrition)) + geom_boxplot() + scale_color_viridis(discrete = TRUE) +
ggtitle("Impact of Monthly Income and Attrition") + xlab("Monthly Income") + ylab("")
#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for Attrition
# H_A: At least one population distribution differs for attrition
kruskal.test(MonthlyIncome~Attrition, data=cs2)
##
## Kruskal-Wallis rank sum test
##
## data: MonthlyIncome by Attrition
## Kruskal-Wallis chi-squared = 34.59, df = 1, p-value = 4.069e-09
Reject Ho, there is overwhelming evidence that suggests MonthlyIncome has an impact on the outcome of Attrition. It appears that more attrition occurs with a lower monthly income. The median monthly income is 24% lower for employee attrtion, compared to non-attrition.
#Impact of Job Involvement on Attrition
cs2 %>%
ggplot(aes(x=JobInvolvementF, color=Attrition, fill=Attrition)) +
geom_bar() + scale_fill_viridis(discrete = TRUE) +
ggtitle("Distribution of Job Involvement and Attrition") +
xlab("Job Involvement") + ylab("")
freqJI <- cs2 %>%
proc_freq("JobInvolvementF", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE,
include.table_percent = FALSE,
include.column_total = FALSE,
include.row_total = FALSE,
)
freqJI$body$dataset$PctAtt <- (freqJI$body$dataset$Yes/(freqJI$body$dataset$Yes + freqJI$body$dataset$No))*100
freqJI$body$dataset
## JobInvolvementF label No Yes PctAtt
## 1 1 Frequency 25 22 46.808511
## 2 2 Frequency 184 44 19.298246
## 3 3 Frequency 447 67 13.035019
## 4 4 Frequency 74 7 8.641975
freqJIc <- cbind(freqJI$body$dataset$Yes, freqJI$body$dataset$No)
#Ho: Attrition is independent of Job Involvement
#Ha: Attriton is not independent of Job Involvement
chisq.test(freqJIc)
##
## Pearson's Chi-squared test
##
## data: freqJIc
## X-squared = 41.465, df = 3, p-value = 5.211e-09
Reject Ho, there is overwhelming evidence that suggests Job Involvement has an impact on the outcome of Attrition. The data does not provide text explaining each job involvement level however that it may be assumed that desirable job involvement increase with each level. Almost 50% of the attrition occurs at the lowest level of job involvement (1), which includes the least number of employees.
#Impact of Job Role on Attrition
cs2 %>%
ggplot(aes(x=JobRole, color=Attrition, fill=Attrition)) +
geom_bar() + scale_fill_viridis(discrete = TRUE) +
scale_x_discrete(guide = guide_axis(angle = 60)) +
ggtitle("Distribution of Job Role and Attrition") +
xlab("Job Role") + ylab("")
freqJR <- cs2 %>%
proc_freq("JobRole", "Attrition",
include.row_percent = FALSE,
include.column_percent = FALSE,
include.table_percent = FALSE,
include.column_total = FALSE,
include.row_total = FALSE,
)
freqJR$body$dataset$PctAtt <- (freqJR$body$dataset$Yes/(freqJR$body$dataset$Yes + freqJR$body$dataset$No))*100
freqJR$body$dataset
## JobRole label No Yes PctAtt
## 1 Healthcare Representative Frequency 68 8 10.526316
## 2 Human Resources Frequency 21 6 22.222222
## 3 Laboratory Technician Frequency 123 30 19.607843
## 4 Manager Frequency 47 4 7.843137
## 5 Manufacturing Director Frequency 85 2 2.298851
## 6 Research Director Frequency 50 1 1.960784
## 7 Research Scientist Frequency 140 32 18.604651
## 8 Sales Executive Frequency 167 33 16.500000
## 9 Sales Representative Frequency 29 24 45.283019
freqJRc <- cbind(freqJR$body$dataset$Yes, freqJR$body$dataset$No)
#Ho: Attrition is independent of Job Role
#Ha: Attriton is not independent of Job Role
chisq.test(freqJRc)
## Warning in chisq.test(freqJRc): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: freqJRc
## X-squared = 60.543, df = 8, p-value = 3.647e-10
Reject Ho, there is overwhelming evidence that suggests Job Role has an impact on the outcome of Attrition. Almost half (45.3%) of the attrition occurs for the Sales Representatives.
#Fit a model with the variables to identify those that may assist with the prediciting of the model
#Kitchen Sink
cs2$AttritionT <-ifelse(cs2$Attrition=="Yes",1,0)
fitR = lm(AttritionT ~ MonthlyIncome+TotalWorkingYears+JobRole+JobSatisfactionF+WorkLifeBalanceF+PercentSalaryHike+YearsSinceLastPromotion+EducationF+EducationField+EmployeeNumber+EnvironmentSatisfaction+Gender+PerformanceRatingF+Department+DistanceFromHome+Age+BusinessTravel+JobInvolvementF+JobLevelF+MaritalStatus+NumCompaniesWorked+OverTime+RelationshipSatisfactionF+StockOptionLevelF+TrainingTimesLastYear+YearsAtCompany+YearsInCurrentRole+YearsSinceLastPromotion+YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike + DailyRate, data=cs2)
summary(fitR)
##
## Call:
## lm(formula = AttritionT ~ MonthlyIncome + TotalWorkingYears +
## JobRole + JobSatisfactionF + WorkLifeBalanceF + PercentSalaryHike +
## YearsSinceLastPromotion + EducationF + EducationField + EmployeeNumber +
## EnvironmentSatisfaction + Gender + PerformanceRatingF + Department +
## DistanceFromHome + Age + BusinessTravel + JobInvolvementF +
## JobLevelF + MaritalStatus + NumCompaniesWorked + OverTime +
## RelationshipSatisfactionF + StockOptionLevelF + TrainingTimesLastYear +
## YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike +
## DailyRate, data = cs2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57471 -0.19881 -0.06755 0.11703 1.10963
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.849e-01 2.073e-01 4.269 2.20e-05 ***
## MonthlyIncome -2.473e-06 1.081e-05 -0.229 0.819111
## TotalWorkingYears -4.655e-03 3.387e-03 -1.374 0.169759
## JobRoleHuman Resources 7.900e-02 1.546e-01 0.511 0.609582
## JobRoleLaboratory Technician -1.975e-02 5.728e-02 -0.345 0.730286
## JobRoleManager -3.975e-02 9.699e-02 -0.410 0.682025
## JobRoleManufacturing Director -5.312e-02 5.052e-02 -1.051 0.293370
## JobRoleResearch Director -1.219e-01 7.868e-02 -1.550 0.121617
## JobRoleResearch Scientist -8.051e-02 5.753e-02 -1.399 0.162074
## JobRoleSales Executive 3.477e-02 1.065e-01 0.326 0.744264
## JobRoleSales Representative 1.491e-01 1.202e-01 1.241 0.214990
## JobSatisfactionF2 -4.279e-02 3.440e-02 -1.244 0.213898
## JobSatisfactionF3 -4.797e-02 3.111e-02 -1.542 0.123462
## JobSatisfactionF4 -1.242e-01 3.075e-02 -4.039 5.88e-05 ***
## WorkLifeBalanceF2 -1.699e-01 5.135e-02 -3.308 0.000980 ***
## WorkLifeBalanceF3 -1.906e-01 4.826e-02 -3.949 8.52e-05 ***
## WorkLifeBalanceF4 -1.889e-01 5.632e-02 -3.353 0.000836 ***
## PercentSalaryHike 2.151e-03 4.692e-03 0.458 0.646810
## YearsSinceLastPromotion 1.202e-02 4.563e-03 2.633 0.008618 **
## EducationF2 5.509e-02 4.029e-02 1.367 0.171890
## EducationF3 3.866e-02 3.703e-02 1.044 0.296734
## EducationF4 2.723e-02 3.999e-02 0.681 0.496069
## EducationF5 7.276e-04 7.225e-02 0.010 0.991967
## EducationFieldLife Sciences -2.043e-01 1.088e-01 -1.877 0.060829 .
## EducationFieldMarketing -1.952e-01 1.157e-01 -1.687 0.091956 .
## EducationFieldMedical -2.194e-01 1.089e-01 -2.014 0.044380 *
## EducationFieldOther -2.006e-01 1.167e-01 -1.718 0.086123 .
## EducationFieldTechnical Degree -1.248e-01 1.134e-01 -1.100 0.271582
## EmployeeNumber -1.713e-05 1.804e-05 -0.949 0.342813
## EnvironmentSatisfaction -2.700e-02 9.904e-03 -2.726 0.006545 **
## GenderMale 1.411e-02 2.203e-02 0.640 0.522186
## PerformanceRatingF4 1.966e-02 4.792e-02 0.410 0.681704
## DepartmentResearch & Development 1.737e-01 1.421e-01 1.222 0.221892
## DepartmentSales 1.763e-01 1.466e-01 1.203 0.229464
## DistanceFromHome 4.372e-03 1.348e-03 3.244 0.001229 **
## Age -3.373e-03 1.678e-03 -2.010 0.044737 *
## BusinessTravelTravel_Frequently 9.354e-02 4.174e-02 2.241 0.025288 *
## BusinessTravelTravel_Rarely 3.207e-02 3.556e-02 0.902 0.367377
## JobInvolvementF2 -2.170e-01 5.087e-02 -4.265 2.23e-05 ***
## JobInvolvementF3 -2.698e-01 4.879e-02 -5.530 4.32e-08 ***
## JobInvolvementF4 -3.114e-01 5.863e-02 -5.312 1.41e-07 ***
## JobLevelF2 -1.422e-01 4.887e-02 -2.910 0.003716 **
## JobLevelF3 -6.000e-02 8.092e-02 -0.741 0.458669
## JobLevelF4 -6.398e-02 1.280e-01 -0.500 0.617223
## JobLevelF5 8.250e-02 1.596e-01 0.517 0.605274
## MaritalStatusMarried 4.050e-02 2.970e-02 1.364 0.173062
## MaritalStatusSingle 3.735e-02 4.670e-02 0.800 0.424115
## NumCompaniesWorked 1.878e-02 4.956e-03 3.790 0.000162 ***
## OverTimeYes 2.131e-01 2.385e-02 8.935 < 2e-16 ***
## RelationshipSatisfactionF2 -6.246e-02 3.456e-02 -1.807 0.071115 .
## RelationshipSatisfactionF3 -8.228e-02 3.132e-02 -2.627 0.008771 **
## RelationshipSatisfactionF4 -8.683e-02 3.101e-02 -2.801 0.005224 **
## StockOptionLevelF1 -1.363e-01 3.622e-02 -3.762 0.000180 ***
## StockOptionLevelF2 -1.286e-01 4.880e-02 -2.635 0.008571 **
## StockOptionLevelF3 -1.341e-02 5.562e-02 -0.241 0.809505
## TrainingTimesLastYear -1.758e-02 8.571e-03 -2.052 0.040528 *
## YearsAtCompany 4.828e-03 4.037e-03 1.196 0.232136
## YearsInCurrentRole -6.639e-03 5.061e-03 -1.312 0.189991
## YearsWithCurrManager -5.525e-03 4.998e-03 -1.105 0.269302
## HourlyRate 5.815e-04 5.391e-04 1.079 0.280996
## MonthlyRate -3.404e-07 1.537e-06 -0.221 0.824782
## DailyRate -2.133e-05 2.708e-05 -0.788 0.431163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3097 on 808 degrees of freedom
## Multiple R-squared: 0.3402, Adjusted R-squared: 0.2903
## F-statistic: 6.828 on 61 and 808 DF, p-value: < 2.2e-16
Naive Bayes Find the average accuracy of 100 train/test splits Model determined from LM of all dataset variables Selection from those that showed signifiance from the model
# Use for the train/test & preditcions
NB = cs2 %>% select(Attrition,TotalWorkingYears,MonthlyIncome,YearsSinceLastPromotion, PerformanceRatingF, Age, OverTime, RelationshipSatisfactionF, JobInvolvementF , BusinessTravel)
#testNB2 will be used for the prediction of the attritions
testNB2 = cs2cmpst %>% select(ID,TotalWorkingYears,MonthlyIncome,YearsSinceLastPromotion, PerformanceRating, Age, OverTime, RelationshipSatisfaction, JobInvolvement, BusinessTravel)
testNB2$PerformanceRatingF = as.factor(testNB2$PerformanceRating)
testNB2$RelationshipSatisfactionF = as.factor(testNB2$RelationshipSatisfaction)
testNB2$JobInvolvementF = as.factor(testNB2$JobInvolvement)
AccHolder = numeric(100)
SensHolder = numeric(100)
SpecHolder = numeric(100)
for (seed in 1:100)
{
set.seed(seed)
trainIndicies = sample(1:dim(NB)[1],round(.75 * dim(NB)[1]))
trainNB = NB[trainIndicies,]
testNB = NB[-trainIndicies,]
modelNB = naiveBayes(trainNB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")], trainNB$Attrition)
PredNB = table(predict(modelNB,testNB), testNB$Attrition)
CMNB = confusionMatrix(table(testNB$Attrition,predict(modelNB, testNB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")])))
AccHolder[seed] = CMNB$overall[1]
SensHolder[seed] = CMNB$byClass[1]
SpecHolder[seed] = CMNB$byClass[2]
}
#Plot of the 100 mean Accuracies
MACC <- data.frame(AccHolder)
MACC %>% ggplot(aes(x=AccHolder))+
geom_histogram(fill="blue",color="black",bins=40) +
ggtitle("Mean Accuracy =",round(mean(AccHolder), digits = 4)) +
xlab("Model Accuracies") +
ylab("Frequency")
#Plot of the 100 mean Sensativities
MSEN <- data.frame(SensHolder)
MSEN %>% ggplot(aes(x=SensHolder))+
geom_histogram(fill="red",color="black", bins=40) +
ggtitle("Mean Sensitivity =",round(mean(SensHolder), digits = 4)) +
xlab("Model Sensitivities") +
ylab("Frequency")
#Plot of the 100 mean Specificities
MSPC <- data.frame(SpecHolder)
MSPC %>% ggplot(aes(x=SpecHolder))+
geom_histogram(fill="green",color="black",bins=40) +
ggtitle("Mean Specificity =",round(mean(SpecHolder), digits = 4)) +
xlab("Model Specificities") +
ylab("Frequency")
#Model to use for the creatio of the predicted attritions
modelNB2 = naiveBayes(NB[,c("TotalWorkingYears","MonthlyIncome","YearsSinceLastPromotion","PerformanceRatingF","Age","OverTime","RelationshipSatisfactionF","JobInvolvementF","BusinessTravel")], NB$Attrition)
#Create the Attrition Prediction CSV file
Case2PredictionsAdyanthayaAttritions <- data.frame(testNB2 %>% select(ID),predict(modelNB,testNB2))
Case2PredictionsAdyanthayaAttritions <- setNames(Case2PredictionsAdyanthayaAttritions, c("ID","Attrition"))
#write to csv file
write.csv(Case2PredictionsAdyanthayaAttritions, "Case2PredictionsAdyanthayaAttritions.csv", row.names = F)
#Accuracy
mean(AccHolder)
## [1] 0.8541743
sd(AccHolder)/sqrt(100)
## [1] 0.002240888
#Sensitivity
mean(SensHolder)
## [1] 0.8751597
sd(SensHolder)/sqrt(100)
## [1] 0.002156709
#Specificity
mean(SpecHolder)
## [1] 0.6169045
sd(SpecHolder)/sqrt(100)
## [1] 0.01309779
The mean accuracy of the model was calculated to be 85.4%. This reflects the number of correctly classified observations. The mean sensitivity of the model was calculated to be 87,5%. This reflects the number of correctly identified attritions that are truly employee attritions. The mean specificity of the model was calculated to be 61.7%. This reflects the number of correctly identified non-attritions that are truly employee non-attritions.
knn Find the average accuracy of 100 train/test splits Model determined from LM of all dataset variables Selection from those that showed signifiance from the model
cs2knn <- cs2 %>% select(Attrition, Age, BusinessTravel, MonthlyIncome, OverTime, TWYcut, JobInvolvement, PerformanceRating, RelationshipSatisfaction, WorkLifeBalance, YearsSinceLastPromotion, NumCompaniesWorked, StockOptionLevel, TrainingTimesLastYear, DistanceFromHome, EducationField)
cs2knn$Attrition <- ifelse(cs2knn$Attrition=="Yes",1,0)
cs2knn$OverTime <- ifelse(cs2knn$OverTime=="Yes",1,0)
cs2knn$BusinessTravel <- ifelse(cs2knn$BusinessTravel=="Non-Travel",1,
ifelse(cs2knn$BusinessTravel=="Travel_Rarely",2,3))
#cs2knn$MaritalStatus <- ifelse(cs2knn$MaritalStatus=="Single",1,
# ifelse(cs2knn$MaritalStatus=="Married",2,3))
cs2knn$EducationField <- ifelse(cs2knn$EducationField=="Life Sciences",1,
ifelse(cs2knn$EducationField=="Marketing",2,
ifelse(cs2knn$EducationField=="Medical",3,
ifelse(cs2knn$EducationField=="Technical Degree",4,5))))
cs2knn$Age = as.numeric(cs2knn$Age)
cs2knn$MonthlyIncome = as.numeric(cs2knn$MonthlyIncome)
#cs2knn$TotalWorkingYears = as.numeric(cs2knn$TotalWorkingYears)
cs2knn$JobInvolvement = as.numeric(cs2knn$JobInvolvement)
cs2knn$PerformanceRating = as.numeric(cs2knn$PerformanceRating)
cs2knn$RelationshipSatisfaction = as.numeric(cs2knn$RelationshipSatisfaction)
cs2knn$WorkLifeBalance = as.numeric(cs2knn$WorkLifeBalance)
cs2knn$YearsSinceLastPromotion = as.numeric(cs2knn$YearsSinceLastPromotion)
cs2knn$NumCompaniesWorked = as.numeric(cs2knn$NumCompaniesWorked)
cs2knn$StockOptionLevel = as.numeric(cs2knn$StockOptionLevel)
#cs2knn$YearsWithCurrManager = as.numeric(cs2knn$YearsWithCurrManager)
cs2knn$TrainingTimesLastYear = as.numeric(cs2knn$TrainingTimesLastYear)
cs2knn$DistanceFromHome = as.numeric(cs2knn$DistanceFromHome)
cs2knn$TWYcut = as.numeric(cs2knn$TWYcut)
set.seed(1)
iterations = 100
numks = 5
splitPerc = .75
masterAcc = matrix(nrow = iterations, ncol = numks)
masterSens = matrix(nrow = iterations, ncol = numks)
masterSpec = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
trainIndicies = sample(1:dim(cs2knn)[1],round(splitPerc * dim(cs2knn)[1]))
trainKnn = cs2knn[trainIndicies,]
testKnn = cs2knn[-trainIndicies,]
for(i in 1:numks)
{
modelKnn = knn(trainKnn[,c("Attrition", "Age", "BusinessTravel", "MonthlyIncome",
"OverTime", "TWYcut", "JobInvolvement",
"PerformanceRating", "RelationshipSatisfaction","WorkLifeBalance", "YearsSinceLastPromotion","NumCompaniesWorked","StockOptionLevel","TrainingTimesLastYear","DistanceFromHome","EducationField")],
testKnn[,c("Attrition", "Age", "BusinessTravel", "MonthlyIncome", "OverTime",
"TWYcut", "JobInvolvement", "PerformanceRating",
"RelationshipSatisfaction","WorkLifeBalance","YearsSinceLastPromotion","NumCompaniesWorked","StockOptionLevel","TrainingTimesLastYear","DistanceFromHome","EducationField")],
trainKnn$Attrition, prob = TRUE, k=i)
table(modelKnn,testKnn$Attrition)
CMKnn = confusionMatrix(table(modelKnn,testKnn$Attrition))
masterAcc[j,i] = CMKnn$overall[1]
masterSens[j,i] = CMKnn$byClass[1]
masterSpec[j,i] = CMKnn$byClass[2]
}
}
#Accuracy
MeanAcc = colMeans(masterAcc)
which.max(MeanAcc)
## [1] 5
max(MeanAcc)
## [1] 0.8237615
#Sensitivity
MeanSens = colMeans(masterSens)
which.max(MeanSens)
## [1] 5
max(MeanSens)
## [1] 0.9695651
#Specificity
MeanSpec = colMeans(masterSpec)
which.max(MeanSpec)
## [1] 1
max(MeanSpec)
## [1] 0.2074003
Unfortunately I was not able to find an ideal model that would improve the specificity.
#JOB ROLE
#Overview of Age, Education, Monthly Income and Business Travel on Job Role
cs2 %>%
select(Age,EducationF, MonthlyIncome, BusinessTravel) %>%
ggpairs(mapping = aes(color = cs2$JobRole))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Impact of Age on Job Role
# Age
cs2 %>%
ggplot(aes(y=Age, color=JobRole)) +
geom_boxplot() +
scale_color_viridis(discrete = TRUE) +
ggtitle("Age & Job Role") +
xlab("") + ylab("Age (Years)")
#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for JobRole
# H_A: At least one population distribution differs for JobRole
cs2AJR <- kruskal.test(Age~JobRole, data=cs2)
cs2AJR
##
## Kruskal-Wallis rank sum test
##
## data: Age by JobRole
## Kruskal-Wallis chi-squared = 158.99, df = 8, p-value < 2.2e-16
Reject Ho, there is overwhelming evidence that suggests Age has an impact on Job Role. It appears that leadership roles (e.g. directors & executives) tends to have employees that are older in age whereas entry level positions have younger employees.
#Impact of Total Working Years on Job Role
cs2 %>%
ggplot(aes(y=TotalWorkingYears, color=JobRole)) +
geom_boxplot() +
scale_color_viridis(discrete = TRUE) +
ggtitle("Total Working Yeara & Job Role") +
xlab("") + ylab("Total Working Years")
#Not a normal distribution so use KW test
# H_0: The poulation distributions contain similar medians for JobRole
# H_A: At least one population distribution differs for JobRole
cs2TWJR <- kruskal.test(TotalWorkingYears~JobRole, data=cs2)
cs2TWJR
##
## Kruskal-Wallis rank sum test
##
## data: TotalWorkingYears by JobRole
## Kruskal-Wallis chi-squared = 333.98, df = 8, p-value < 2.2e-16
Reject Ho, there is overwhelming evidence that suggests TotalWorkingYears has an impact on Job Role. It appears that leadership roles tends to have employees that have higher total working years whereas entry level positions have lowere total working years.
#Age & Total Working Years impact on Job Role
cs2 %>%
ggplot(aes(x=Age, y=TotalWorkingYears, color=JobRole)) +
geom_point()
cs2 %>%
ggplot(aes(x=Age, y=TotalWorkingYears, color=JobRole)) +
geom_point() +
geom_smooth(method="lm")
## `geom_smooth()` using formula = 'y ~ x'
There is a direct relationship on between Age & Total Working Hours
on Job Role. The job roles were smoothed to the data using fit linear
models. As seen in the individual models, positions of leadership tend
to be of high ages and total working years.
#Education impact on Job Role
cs2 %>%
ggplot(aes(x=EducationF, color=JobRole, fill=JobRole)) +
geom_bar() +
facet_grid(rows = vars(JobRole)) +
scale_fill_viridis(discrete = TRUE) +
ggtitle("Education & Job Role") +
xlab("Education") + ylab("") +
theme(
strip.background = element_blank(),
strip.text.y = element_blank()) +
scale_x_discrete(guide = guide_axis(angle = 45)) +
coord_flip()
freqE <- cs2 %>%
proc_freq("JobRole", "EducationF",
include.row_percent = FALSE,
include.column_percent = FALSE,
include.table_percent = FALSE,
include.column_total = FALSE,
include.row_total = FALSE,
)
freqE <- cbind(freqE$body$dataset$`1`, freqE$body$dataset$`2`, freqE$body$dataset$`3`, freqE$body$dataset$`4`, freqE$body$dataset$`5`)
freqE
## [,1] [,2] [,3] [,4] [,5]
## [1,] 10 17 24 23 2
## [2,] 5 8 9 4 1
## [3,] 23 40 52 35 3
## [4,] 4 10 18 17 2
## [5,] 10 18 38 20 1
## [6,] 3 7 18 16 7
## [7,] 16 30 78 44 4
## [8,] 14 39 70 71 6
## [9,] 13 13 17 10 0
#Ho: Attrition is independent of Job Role
#Ha: Attriton is not independent of Job Role
cs2EJR <- chisq.test(freqE)
## Warning in chisq.test(freqE): Chi-squared approximation may be incorrect
cs2EJR
##
## Pearson's Chi-squared test
##
## data: freqE
## X-squared = 63.285, df = 32, p-value = 0.000804
Reject Ho, there is strong evidence that suggests Education has an impact on Job Role. Not knowing exactly what each of the education categories represent, it may be assumed that the level of education increases with code value. Job roles such as Sales Executative or Research Scientist would be expected to have a higher education level as compared to a Healthcare Representative or Human Resources.
#MONTHLY INCOME
#Fit all of the variables to identify those that contribute to the prediction of monthly income
#fitR = lm(MonthlyIncome~Attrition+TotalWorkingYears+JobRole+JobSatisfactionF+WorkLifeBalanceF+PercentSalaryHike+YearsSinceLastPromotion+EducationF+EducationField+EmployeeNumber+EnvironmentSatisfaction+Gender+PerformanceRatingF+Department+DistanceFromHome+Age+BusinessTravel+JobInvolvementF+JobLevelF+MaritalStatus+NumCompaniesWorked+OverTime+RelationshipSatisfactionF+StockOptionLevelF+TrainingTimesLastYear+YearsAtCompany+YearsInCurrentRole+YearsSinceLastPromotion+YearsWithCurrManager + HourlyRate + MonthlyRate + PercentSalaryHike + DailyRate, data=cs2)
#summary(fitR)
set.seed(5)
trainIndicies = sample(1:dim(cs2)[1],round(.75 * dim(cs2cmpns)[1]),replace = FALSE)
trainR = cs2[trainIndicies,]
testR = cs2[-trainIndicies,]
#Determine the optimal model of variables to fit the monthly income
#Leave one out cross validation
ModelRfit = lm(MonthlyIncome ~ TotalWorkingYears+JobRole+JobLevel+BusinessTravel+EducationF+StockOptionLevelF+Age+Gender+OverTime+NumCompaniesWorked, data=trainR, trControl = trainControl((method = "LOOCV")))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trControl' will be disregarded
summary(ModelRfit)
##
## Call:
## lm(formula = MonthlyIncome ~ TotalWorkingYears + JobRole + JobLevel +
## BusinessTravel + EducationF + StockOptionLevelF + Age + Gender +
## OverTime + NumCompaniesWorked, data = trainR, trControl = trainControl((method = "LOOCV")))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3786.5 -633.2 13.9 628.4 4134.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -415.087 329.841 -1.258 0.208698
## TotalWorkingYears 45.406 10.609 4.280 2.16e-05 ***
## JobRoleHuman Resources -337.137 302.891 -1.113 0.266106
## JobRoleLaboratory Technician -650.025 192.211 -3.382 0.000765 ***
## JobRoleManager 3995.605 261.884 15.257 < 2e-16 ***
## JobRoleManufacturing Director -163.853 188.401 -0.870 0.384796
## JobRoleResearch Director 4013.477 243.472 16.484 < 2e-16 ***
## JobRoleResearch Scientist -334.351 188.288 -1.776 0.076259 .
## JobRoleSales Executive -35.369 160.491 -0.220 0.825647
## JobRoleSales Representative -422.013 241.469 -1.748 0.081007 .
## JobLevel 2809.779 91.618 30.668 < 2e-16 ***
## BusinessTravelTravel_Frequently 174.107 155.798 1.118 0.264201
## BusinessTravelTravel_Rarely 373.542 132.732 2.814 0.005042 **
## EducationF2 -201.361 150.197 -1.341 0.180519
## EducationF3 -167.674 138.465 -1.211 0.226373
## EducationF4 -137.291 147.288 -0.932 0.351629
## EducationF5 -640.731 273.806 -2.340 0.019591 *
## StockOptionLevelF1 45.201 87.574 0.516 0.605934
## StockOptionLevelF2 -146.268 147.816 -0.990 0.322786
## StockOptionLevelF3 136.529 170.239 0.802 0.422866
## Age 2.937 6.197 0.474 0.635732
## GenderMale 68.361 81.732 0.836 0.403248
## OverTimeYes -44.707 88.758 -0.504 0.614653
## NumCompaniesWorked 22.731 16.632 1.367 0.172196
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1014 on 628 degrees of freedom
## Multiple R-squared: 0.9549, Adjusted R-squared: 0.9532
## F-statistic: 578 on 23 and 628 DF, p-value: < 2.2e-16
confint(ModelRfit)
## 2.5 % 97.5 %
## (Intercept) -1062.811787 232.63728
## TotalWorkingYears 24.572075 66.23970
## JobRoleHuman Resources -931.939423 257.66451
## JobRoleLaboratory Technician -1027.478869 -272.57111
## JobRoleManager 3481.330963 4509.87848
## JobRoleManufacturing Director -533.825645 206.11980
## JobRoleResearch Director 3535.358905 4491.59568
## JobRoleResearch Scientist -704.101496 35.39890
## JobRoleSales Executive -350.533795 279.79562
## JobRoleSales Representative -896.197997 52.17275
## JobLevel 2629.863200 2989.69429
## BusinessTravelTravel_Frequently -131.840956 480.05419
## BusinessTravelTravel_Rarely 112.889203 634.19415
## EducationF2 -496.309595 93.58719
## EducationF3 -439.584859 104.23744
## EducationF4 -426.526625 151.94518
## EducationF5 -1178.417704 -103.04503
## StockOptionLevelF1 -126.771930 217.17366
## StockOptionLevelF2 -436.541419 144.00546
## StockOptionLevelF3 -197.777987 470.83587
## Age -9.232326 15.10577
## GenderMale -92.140615 228.86197
## OverTimeYes -219.005705 129.59173
## NumCompaniesWorked -9.929349 55.39220
#Calculate the RSME of the model
RSME = sqrt(mean(ModelRfit$residuals^2))
RSME
## [1] 995.3789
#Find the MSPE
ModelRPreds = predict(ModelRfit, newdata = testR)
as.data.frame(ModelRPreds)
## ModelRPreds
## 5 2555.388
## 9 2377.410
## 13 9067.504
## 14 2616.021
## 15 2955.271
## 18 5401.720
## 19 2829.058
## 21 6023.114
## 23 4805.207
## 29 9211.908
## 33 16500.643
## 38 6007.967
## 41 16706.794
## 42 5563.804
## 46 2408.356
## 48 8612.765
## 50 2494.343
## 60 2311.995
## 61 9276.224
## 62 8878.475
## 63 2485.677
## 74 2510.807
## 75 9019.753
## 76 5182.894
## 81 6280.055
## 86 12424.973
## 88 2402.318
## 96 6665.195
## 101 2538.296
## 102 8968.668
## 105 16947.512
## 106 2556.812
## 109 6238.143
## 111 2376.805
## 112 16484.100
## 114 5987.123
## 120 6198.984
## 124 5880.195
## 136 5725.864
## 140 8952.101
## 142 2502.557
## 144 6036.339
## 151 6018.867
## 157 6253.234
## 161 5370.265
## 167 16365.541
## 168 15257.413
## 171 5716.754
## 173 9708.467
## 174 8140.047
## 175 16396.339
## 186 2711.759
## 190 5401.777
## 194 5769.220
## 195 2805.627
## 197 2575.036
## 200 2169.027
## 204 2883.221
## 205 2601.934
## 208 6644.281
## 219 2604.071
## 230 12749.602
## 242 18835.120
## 244 2627.268
## 245 6044.969
## 263 2593.205
## 266 2439.317
## 270 5765.990
## 271 5562.680
## 273 5711.876
## 276 19437.610
## 277 2489.560
## 283 6208.804
## 297 5859.724
## 298 5115.765
## 301 5826.174
## 306 2155.944
## 309 6120.286
## 315 6285.826
## 316 5865.985
## 332 9035.881
## 335 19277.347
## 336 5662.261
## 346 12161.326
## 348 5463.773
## 349 2408.141
## 350 12784.009
## 351 13492.387
## 354 5575.673
## 355 9517.657
## 359 5840.699
## 367 5286.473
## 371 5555.318
## 377 2926.677
## 378 6109.957
## 383 6492.835
## 386 13094.210
## 389 5964.525
## 398 2507.197
## 402 5552.727
## 406 16313.339
## 411 2709.940
## 416 5782.154
## 417 2209.965
## 425 5820.520
## 428 2521.502
## 430 6216.943
## 431 5727.675
## 433 5715.763
## 444 2938.633
## 451 5928.115
## 452 2758.951
## 472 2210.042
## 475 2316.992
## 485 10304.998
## 493 6096.325
## 498 2539.969
## 500 5781.991
## 504 8693.803
## 506 5794.540
## 515 2060.299
## 520 2399.425
## 526 9593.799
## 532 6328.551
## 536 2632.602
## 538 2804.916
## 541 5560.478
## 546 2842.847
## 547 9133.201
## 549 13287.609
## 550 2789.485
## 553 5692.312
## 554 3040.849
## 556 6415.308
## 560 5950.864
## 565 2346.764
## 567 5589.901
## 570 12148.857
## 571 6289.915
## 572 2563.690
## 577 3357.546
## 582 15465.615
## 583 6340.071
## 584 5828.852
## 588 2996.191
## 589 2513.366
## 595 8867.502
## 597 9022.098
## 598 5967.489
## 600 2723.442
## 603 12788.483
## 608 5839.350
## 611 5362.358
## 613 2763.155
## 615 8920.440
## 617 2622.928
## 618 5553.931
## 620 1670.008
## 621 1833.264
## 623 2895.961
## 637 16273.052
## 641 2388.009
## 650 2613.910
## 652 6195.281
## 653 13294.712
## 655 2347.022
## 657 13175.994
## 658 16308.980
## 661 5084.119
## 664 2325.830
## 669 5410.953
## 672 5719.523
## 678 5200.338
## 680 3026.800
## 682 2513.307
## 683 6058.983
## 688 2847.733
## 689 2142.830
## 690 2910.299
## 691 5606.358
## 692 12556.234
## 693 6336.301
## 695 5531.835
## 706 6042.120
## 708 2888.603
## 709 2323.404
## 712 16827.638
## 719 5990.827
## 720 8729.080
## 721 8755.275
## 722 6036.881
## 723 2686.813
## 734 5648.433
## 738 6252.169
## 741 5834.646
## 742 13078.963
## 743 5729.797
## 755 13398.838
## 765 2624.768
## 768 9018.549
## 769 6115.853
## 773 9073.715
## 781 2424.913
## 787 13092.046
## 789 19945.213
## 790 2453.169
## 795 5265.184
## 801 5625.700
## 802 6202.320
## 803 16307.615
## 813 5726.088
## 816 3157.770
## 817 5948.408
## 831 4985.476
## 836 8495.194
## 846 5399.206
## 848 9402.251
## 865 8610.787
MSPE = data.frame(Observed = testR$MonthlyIncome, Predicted = ModelRPreds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
mean(MSPE$SquaredResidual)
## [1] 1409174
#Create the Monthly Salary Predicted CSV File
cs2cmpns$EducationF = as.factor(cs2cmpns$Education)
cs2cmpns$StockOptionLevelF = as.factor(cs2cmpns$StockOptionLevel)
Case2PredictionsAdyanthayaSalary <- data.frame(cs2cmpns %>% select(ID),predict(ModelRfit, newdata=cs2cmpns))
Case2PredictionsAdyanthayaSalary <- setNames(Case2PredictionsAdyanthayaSalary, c("ID","MonthlyIncome"))
#write to csv file
write.csv(Case2PredictionsAdyanthayaSalary, "Case2PredictionsAdyanthayaSalary.csv", row.names = F)